## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
popdensity by state.county by state.
ggplot(data = midwest) from
above.
alpha instead.
colors().
ggplot(midwest) +
geom_point(aes(x = popdensity, y = percollege, color = state)) +
geom_smooth(aes(x = popdensity, y = percollege, color = state),
se = FALSE)ggplot(midwest,
aes(x = popdensity, y = percollege, color = state)) +
geom_point() +
geom_smooth(se = FALSE)ggplot(midwest,
aes(x = popdensity, y = percollege, color = state)) +
geom_point() +
scale_x_continuous("Population Density",
breaks = seq(0, 80000, 20000)) +
scale_y_continuous("Percent College Graduates") +
scale_color_discrete("State") +
labs(title = "Percent College Educated by Population Density",
subtitle = "County level data for five midwest states")## [1] 0
## [1] 35
## [1] 2
## [1] 2
## [1] 4
## [1] 4
## [1] 12
## Error in eval(expr, envir, enclos): object 'Case_sensitive' not found
## [1] -0.6264538 0.1836433 -0.8356286 1.5952808 0.3295078
## [1] -0.6264538 0.1836433 -0.8356286 1.5952808 0.3295078
## [1] -0.6264538 0.1836433 -0.8356286 1.5952808 0.3295078
head() is your friend## # A tibble: 6 × 28
## PID county state area poptotal popdensity popwhite popblack popamerindian
## <int> <chr> <chr> <dbl> <int> <dbl> <int> <int> <int>
## 1 561 ADAMS IL 0.052 66090 1271. 63917 1702 98
## 2 562 ALEXAND… IL 0.014 10626 759 7054 3496 19
## 3 563 BOND IL 0.022 14991 681. 14477 429 35
## 4 564 BOONE IL 0.017 30806 1812. 29344 127 46
## 5 565 BROWN IL 0.018 5836 324. 5264 547 14
## 6 566 BUREAU IL 0.05 35688 714. 35157 50 65
## # … with 19 more variables: popasian <int>, popother <int>, percwhite <dbl>,
## # percblack <dbl>, percamerindan <dbl>, percasian <dbl>, percother <dbl>,
## # popadults <int>, perchsd <dbl>, percollege <dbl>, percprof <dbl>,
## # poppovertyknown <int>, percpovertyknown <dbl>, percbelowpoverty <dbl>,
## # percchildbelowpovert <dbl>, percadultpoverty <dbl>,
## # percelderlypoverty <dbl>, inmetro <int>, category <chr>
str()## tibble [437 × 28] (S3: tbl_df/tbl/data.frame)
## $ PID : int [1:437] 561 562 563 564 565 566 567 568 569 570 ...
## $ county : chr [1:437] "ADAMS" "ALEXANDER" "BOND" "BOONE" ...
## $ state : chr [1:437] "IL" "IL" "IL" "IL" ...
## $ area : num [1:437] 0.052 0.014 0.022 0.017 0.018 0.05 0.017 0.027 0.024 0.058 ...
## $ poptotal : int [1:437] 66090 10626 14991 30806 5836 35688 5322 16805 13437 173025 ...
## $ popdensity : num [1:437] 1271 759 681 1812 324 ...
## $ popwhite : int [1:437] 63917 7054 14477 29344 5264 35157 5298 16519 13384 146506 ...
## $ popblack : int [1:437] 1702 3496 429 127 547 50 1 111 16 16559 ...
## $ popamerindian : int [1:437] 98 19 35 46 14 65 8 30 8 331 ...
## $ popasian : int [1:437] 249 48 16 150 5 195 15 61 23 8033 ...
## $ popother : int [1:437] 124 9 34 1139 6 221 0 84 6 1596 ...
## $ percwhite : num [1:437] 96.7 66.4 96.6 95.3 90.2 ...
## $ percblack : num [1:437] 2.575 32.9 2.862 0.412 9.373 ...
## $ percamerindan : num [1:437] 0.148 0.179 0.233 0.149 0.24 ...
## $ percasian : num [1:437] 0.3768 0.4517 0.1067 0.4869 0.0857 ...
## $ percother : num [1:437] 0.1876 0.0847 0.2268 3.6973 0.1028 ...
## $ popadults : int [1:437] 43298 6724 9669 19272 3979 23444 3583 11323 8825 95971 ...
## $ perchsd : num [1:437] 75.1 59.7 69.3 75.5 68.9 ...
## $ percollege : num [1:437] 19.6 11.2 17 17.3 14.5 ...
## $ percprof : num [1:437] 4.36 2.87 4.49 4.2 3.37 ...
## $ poppovertyknown : int [1:437] 63628 10529 14235 30337 4815 35107 5241 16455 13081 154934 ...
## $ percpovertyknown : num [1:437] 96.3 99.1 95 98.5 82.5 ...
## $ percbelowpoverty : num [1:437] 13.15 32.24 12.07 7.21 13.52 ...
## $ percchildbelowpovert: num [1:437] 18 45.8 14 11.2 13 ...
## $ percadultpoverty : num [1:437] 11.01 27.39 10.85 5.54 11.14 ...
## $ percelderlypoverty : num [1:437] 12.44 25.23 12.7 6.22 19.2 ...
## $ inmetro : int [1:437] 0 0 0 1 0 0 0 0 0 1 ...
## $ category : chr [1:437] "AAR" "LHR" "AAR" "ALU" ...
summary()## PID county state area
## Min. : 561 Length:437 Length:437 Min. :0.00500
## 1st Qu.: 670 Class :character Class :character 1st Qu.:0.02400
## Median :1221 Mode :character Mode :character Median :0.03000
## Mean :1437 Mean :0.03317
## 3rd Qu.:2059 3rd Qu.:0.03800
## Max. :3052 Max. :0.11000
## poptotal popdensity popwhite popblack
## Min. : 1701 Min. : 85.05 Min. : 416 Min. : 0
## 1st Qu.: 18840 1st Qu.: 622.41 1st Qu.: 18630 1st Qu.: 29
## Median : 35324 Median : 1156.21 Median : 34471 Median : 201
## Mean : 96130 Mean : 3097.74 Mean : 81840 Mean : 11024
## 3rd Qu.: 75651 3rd Qu.: 2330.00 3rd Qu.: 72968 3rd Qu.: 1291
## Max. :5105067 Max. :88018.40 Max. :3204947 Max. :1317147
## popamerindian popasian popother percwhite
## Min. : 4.0 Min. : 0 Min. : 0 Min. :10.69
## 1st Qu.: 44.0 1st Qu.: 35 1st Qu.: 20 1st Qu.:94.89
## Median : 94.0 Median : 102 Median : 66 Median :98.03
## Mean : 343.1 Mean : 1310 Mean : 1613 Mean :95.56
## 3rd Qu.: 288.0 3rd Qu.: 401 3rd Qu.: 345 3rd Qu.:99.07
## Max. :10289.0 Max. :188565 Max. :384119 Max. :99.82
## percblack percamerindan percasian percother
## Min. : 0.0000 Min. : 0.05623 Min. :0.0000 Min. :0.00000
## 1st Qu.: 0.1157 1st Qu.: 0.15793 1st Qu.:0.1737 1st Qu.:0.09102
## Median : 0.5390 Median : 0.21502 Median :0.2972 Median :0.17844
## Mean : 2.6763 Mean : 0.79894 Mean :0.4872 Mean :0.47906
## 3rd Qu.: 2.6014 3rd Qu.: 0.38362 3rd Qu.:0.5212 3rd Qu.:0.48050
## Max. :40.2100 Max. :89.17738 Max. :5.0705 Max. :7.52427
## popadults perchsd percollege percprof
## Min. : 1287 Min. :46.91 Min. : 7.336 Min. : 0.5203
## 1st Qu.: 12271 1st Qu.:71.33 1st Qu.:14.114 1st Qu.: 2.9980
## Median : 22188 Median :74.25 Median :16.798 Median : 3.8142
## Mean : 60973 Mean :73.97 Mean :18.273 Mean : 4.4473
## 3rd Qu.: 47541 3rd Qu.:77.20 3rd Qu.:20.550 3rd Qu.: 4.9493
## Max. :3291995 Max. :88.90 Max. :48.079 Max. :20.7913
## poppovertyknown percpovertyknown percbelowpoverty percchildbelowpovert
## Min. : 1696 Min. :80.90 Min. : 2.180 Min. : 1.919
## 1st Qu.: 18364 1st Qu.:96.89 1st Qu.: 9.199 1st Qu.:11.624
## Median : 33788 Median :98.17 Median :11.822 Median :15.270
## Mean : 93642 Mean :97.11 Mean :12.511 Mean :16.447
## 3rd Qu.: 72840 3rd Qu.:98.60 3rd Qu.:15.133 3rd Qu.:20.352
## Max. :5023523 Max. :99.86 Max. :48.691 Max. :64.308
## percadultpoverty percelderlypoverty inmetro category
## Min. : 1.938 Min. : 3.547 Min. :0.0000 Length:437
## 1st Qu.: 7.668 1st Qu.: 8.912 1st Qu.:0.0000 Class :character
## Median :10.008 Median :10.869 Median :0.0000 Mode :character
## Mean :10.919 Mean :11.389 Mean :0.3432
## 3rd Qu.:13.182 3rd Qu.:13.412 3rd Qu.:1.0000
## Max. :43.312 Max. :31.162 Max. :1.0000
?function_name to explore the details of the
function. The examples at the bottom of every R help page can be
especially helpful.dplyr for data manipulationdplyr basicsThe dplyr package uses verbs for common data
manipulation tasks. These include:
filter()count()arrange()select()mutate()summarise()https://allisonhorst.github.io/palmerpenguins/
## # A tibble: 6 × 8
## species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
## <fct> <fct> <dbl> <dbl> <int> <int> <fct>
## 1 Adelie Torge… 39.1 18.7 181 3750 male
## 2 Adelie Torge… 39.5 17.4 186 3800 fema…
## 3 Adelie Torge… 40.3 18 195 3250 fema…
## 4 Adelie Torge… NA NA NA NA <NA>
## 5 Adelie Torge… 36.7 19.3 193 3450 fema…
## 6 Adelie Torge… 39.3 20.6 190 3650 male
## # … with 1 more variable: year <int>
filter## # A tibble: 110 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # … with 100 more rows, and 2 more variables: sex <fct>, year <int>
## # A tibble: 110 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # … with 100 more rows, and 2 more variables: sex <fct>, year <int>
><>=<=is.na function to identify penguins with
missing sex.## # A tibble: 152 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # … with 142 more rows, and 2 more variables: sex <fct>, year <int>
## # A tibble: 0 × 8
## # … with 8 variables: species <fct>, island <fct>, bill_length_mm <dbl>,
## # bill_depth_mm <dbl>, flipper_length_mm <int>, body_mass_g <int>, sex <fct>,
## # year <int>
## # A tibble: 50 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # … with 40 more rows, and 2 more variables: sex <fct>, year <int>
## # A tibble: 50 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # … with 40 more rows, and 2 more variables: sex <fct>, year <int>
## # A tibble: 224 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # … with 214 more rows, and 2 more variables: sex <fct>, year <int>
%in%## # A tibble: 224 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # … with 214 more rows, and 2 more variables: sex <fct>, year <int>
## # A tibble: 234 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Biscoe 39.6 17.7 186 3500
## 2 Adelie Biscoe 40.1 18.9 188 4300
## 3 Adelie Biscoe 35 17.9 190 3450
## 4 Adelie Biscoe 42 19.5 200 4050
## 5 Adelie Biscoe 34.5 18.1 187 2900
## 6 Adelie Biscoe 41.4 18.6 191 3700
## 7 Adelie Biscoe 39 17.5 186 3550
## 8 Adelie Biscoe 40.6 18.8 193 3800
## 9 Adelie Biscoe 36.5 16.6 181 2850
## 10 Adelie Biscoe 37.6 19.1 194 3750
## # … with 224 more rows, and 2 more variables: sex <fct>, year <int>
## # A tibble: 60 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Gentoo Biscoe 46.1 13.2 211 4500
## 2 Gentoo Biscoe 50 16.3 230 5700
## 3 Gentoo Biscoe 48.7 14.1 210 4450
## 4 Gentoo Biscoe 50 15.2 218 5700
## 5 Gentoo Biscoe 47.6 14.5 215 5400
## 6 Gentoo Biscoe 46.5 13.5 210 4550
## 7 Gentoo Biscoe 45.4 14.6 211 4800
## 8 Gentoo Biscoe 46.7 15.3 219 5200
## 9 Gentoo Biscoe 43.3 13.4 209 4400
## 10 Gentoo Biscoe 46.8 15.4 215 5150
## # … with 50 more rows, and 2 more variables: sex <fct>, year <int>
gss_cat data.count## # A tibble: 3 × 2
## year n
## <int> <int>
## 1 2007 110
## 2 2008 114
## 3 2009 120
## # A tibble: 3 × 2
## island n
## <fct> <int>
## 1 Biscoe 168
## 2 Dream 124
## 3 Torgersen 52
arrange## # A tibble: 344 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Dream 32.1 15.5 188 3050
## 2 Adelie Dream 33.1 16.1 178 2900
## 3 Adelie Torgersen 33.5 19 190 3600
## 4 Adelie Dream 34 17.1 185 3400
## 5 Adelie Torgersen 34.1 18.1 193 3475
## 6 Adelie Torgersen 34.4 18.4 184 3325
## 7 Adelie Biscoe 34.5 18.1 187 2900
## 8 Adelie Torgersen 34.6 21.1 198 4400
## 9 Adelie Torgersen 34.6 17.2 189 3200
## 10 Adelie Biscoe 35 17.9 190 3450
## # … with 334 more rows, and 2 more variables: sex <fct>, year <int>
## # A tibble: 344 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Dream 32.1 15.5 188 3050
## 2 Adelie Dream 33.1 16.1 178 2900
## 3 Adelie Torgersen 33.5 19 190 3600
## 4 Adelie Dream 34 17.1 185 3400
## 5 Adelie Torgersen 34.1 18.1 193 3475
## 6 Adelie Torgersen 34.4 18.4 184 3325
## 7 Adelie Biscoe 34.5 18.1 187 2900
## 8 Adelie Torgersen 34.6 21.1 198 4400
## 9 Adelie Torgersen 34.6 17.2 189 3200
## 10 Adelie Biscoe 35 17.9 190 3450
## # … with 334 more rows, and 2 more variables: sex <fct>, year <int>
## # A tibble: 344 × 8
## species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 46 21.5 194 4200
## 2 Adelie Torgersen 38.6 21.2 191 3800
## 3 Adelie Dream 42.3 21.2 191 4150
## 4 Adelie Torgersen 34.6 21.1 198 4400
## 5 Adelie Dream 39.2 21.1 196 4150
## 6 Adelie Biscoe 41.3 21.1 195 4400
## 7 Chinstrap Dream 54.2 20.8 201 4300
## 8 Adelie Torgersen 42.5 20.7 197 4500
## 9 Adelie Biscoe 39.6 20.7 191 3900
## 10 Chinstrap Dream 52 20.7 210 4800
## # … with 334 more rows, and 2 more variables: sex <fct>, year <int>
gss_cat data, count the number within each
political party (partyid).select## # A tibble: 344 × 4
## island bill_depth_mm flipper_length_mm sex
## <fct> <dbl> <int> <fct>
## 1 Torgersen 18.7 181 male
## 2 Torgersen 17.4 186 female
## 3 Torgersen 18 195 female
## 4 Torgersen NA NA <NA>
## 5 Torgersen 19.3 193 female
## 6 Torgersen 20.6 190 male
## 7 Torgersen 17.8 181 female
## 8 Torgersen 19.6 195 male
## 9 Torgersen 18.1 193 <NA>
## 10 Torgersen 20.2 190 <NA>
## # … with 334 more rows
starts_with()ends_with()contains()matches()num_range():everything()starts_with helper## # A tibble: 344 × 2
## species sex
## <fct> <fct>
## 1 Adelie male
## 2 Adelie female
## 3 Adelie female
## 4 Adelie <NA>
## 5 Adelie female
## 6 Adelie male
## 7 Adelie female
## 8 Adelie male
## 9 Adelie <NA>
## 10 Adelie <NA>
## # … with 334 more rows
## # A tibble: 344 × 2
## bill_length_mm flipper_length_mm
## <dbl> <int>
## 1 39.1 181
## 2 39.5 186
## 3 40.3 195
## 4 NA NA
## 5 36.7 193
## 6 39.3 190
## 7 38.9 181
## 8 39.2 195
## 9 34.1 193
## 10 42 190
## # … with 334 more rows
## # A tibble: 344 × 4
## flipper_length_mm body_mass_g sex year
## <int> <int> <fct> <int>
## 1 181 3750 male 2007
## 2 186 3800 female 2007
## 3 195 3250 female 2007
## 4 NA NA <NA> 2007
## 5 193 3450 female 2007
## 6 190 3650 male 2007
## 7 181 3625 female 2007
## 8 195 4675 male 2007
## 9 193 3475 <NA> 2007
## 10 190 4250 <NA> 2007
## # … with 334 more rows
## # A tibble: 344 × 4
## species body_mass_g sex year
## <fct> <int> <fct> <int>
## 1 Adelie 3750 male 2007
## 2 Adelie 3800 female 2007
## 3 Adelie 3250 female 2007
## 4 Adelie NA <NA> 2007
## 5 Adelie 3450 female 2007
## 6 Adelie 3650 male 2007
## 7 Adelie 3625 female 2007
## 8 Adelie 4675 male 2007
## 9 Adelie 3475 <NA> 2007
## 10 Adelie 4250 <NA> 2007
## # … with 334 more rows
everything## # A tibble: 344 × 8
## species island sex year bill_length_mm bill_depth_mm flipper_length_mm
## <fct> <fct> <fct> <int> <dbl> <dbl> <int>
## 1 Adelie Torgersen male 2007 39.1 18.7 181
## 2 Adelie Torgersen female 2007 39.5 17.4 186
## 3 Adelie Torgersen female 2007 40.3 18 195
## 4 Adelie Torgersen <NA> 2007 NA NA NA
## 5 Adelie Torgersen female 2007 36.7 19.3 193
## 6 Adelie Torgersen male 2007 39.3 20.6 190
## 7 Adelie Torgersen female 2007 38.9 17.8 181
## 8 Adelie Torgersen male 2007 39.2 19.6 195
## 9 Adelie Torgersen <NA> 2007 34.1 18.1 193
## 10 Adelie Torgersen <NA> 2007 42 20.2 190
## # … with 334 more rows, and 1 more variable: body_mass_g <int>
rename function## # A tibble: 344 × 8
## penguin_species penguin_island bill_length_mm bill_depth_mm flipper_length_mm
## <fct> <fct> <dbl> <dbl> <int>
## 1 Adelie Torgersen 39.1 18.7 181
## 2 Adelie Torgersen 39.5 17.4 186
## 3 Adelie Torgersen 40.3 18 195
## 4 Adelie Torgersen NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193
## 6 Adelie Torgersen 39.3 20.6 190
## 7 Adelie Torgersen 38.9 17.8 181
## 8 Adelie Torgersen 39.2 19.6 195
## 9 Adelie Torgersen 34.1 18.1 193
## 10 Adelie Torgersen 42 20.2 190
## # … with 334 more rows, and 3 more variables: body_mass_g <int>, sex <fct>,
## # year <int>
dplyr helper functions and the
penguins data, select all the attributes that are measured
in millimeters.gss_cat data
to ‘x1’, ‘x2’, ‘x3’.num_range function.https://fivethirtyeight.com/features/both-republicans-and-democrats-have-an-age-problem/
## Some larger datasets need to be installed separately, like senators and
## house_district_forecast. To install these, we recommend you install the
## fivethirtyeightdata package by running:
## install.packages('fivethirtyeightdata', repos =
## 'https://fivethirtyeightdata.github.io/drat/', type = 'source')
## # A tibble: 6 × 13
## congress chamber bioguide firstname middlename lastname suffix birthday
## <int> <chr> <chr> <chr> <chr> <chr> <chr> <date>
## 1 80 house M000112 Joseph Jefferson Mansfield <NA> 1861-02-09
## 2 80 house D000448 Robert Lee Doughton <NA> 1863-11-07
## 3 80 house S000001 Adolph Joachim Sabath <NA> 1866-04-04
## 4 80 house E000023 Charles Aubrey Eaton <NA> 1868-03-29
## 5 80 house L000296 William <NA> Lewis <NA> 1868-09-22
## 6 80 house G000017 James A. Gallagher <NA> 1869-01-16
## # … with 5 more variables: state <chr>, party <chr>, incumbent <lgl>,
## # termstart <date>, age <dbl>
mutatecongress_red <- select(congress_age, congress, chamber, state, party)
mutate(congress_red,
democrat = ifelse(party == 'D', 1, 0),
num_democrat = sum(democrat)
)## # A tibble: 18,635 × 6
## congress chamber state party democrat num_democrat
## <int> <chr> <chr> <chr> <dbl> <dbl>
## 1 80 house TX D 1 10290
## 2 80 house NC D 1 10290
## 3 80 house IL D 1 10290
## 4 80 house NJ R 0 10290
## 5 80 house KY R 0 10290
## 6 80 house PA R 0 10290
## 7 80 house CA R 0 10290
## 8 80 house NY D 1 10290
## 9 80 house WI R 0 10290
## 10 80 house MA R 0 10290
## # … with 18,625 more rows
diamonds data, use ?diamonds for
more information on the data, use the mutate function to
calculate the price per carat. Hint, this operation would involve
standardizing the price variable so that all are comparable at 1
carat.mutate, calculate the rank of the original price
variable and the new price variable calculated above using the
min_rank function. Are there differences in the ranking of
the prices?summarisecongress_2 <- mutate(congress_age,
democrat = ifelse(party == 'D', 1, 0)
)
summarise(congress_2,
num_democrat = sum(democrat)
)## # A tibble: 1 × 1
## num_democrat
## <dbl>
## 1 10290
group_bycongress_grp <- group_by(congress_2, congress)
summarise(congress_grp,
num_democrat = sum(democrat),
total = n(),
prop_democrat = num_democrat / total
)## # A tibble: 34 × 4
## congress num_democrat total prop_democrat
## <int> <dbl> <int> <dbl>
## 1 80 247 555 0.445
## 2 81 330 557 0.592
## 3 82 292 555 0.526
## 4 83 274 557 0.492
## 5 84 288 544 0.529
## 6 85 295 547 0.539
## 7 86 356 554 0.643
## 8 87 339 559 0.606
## 9 88 332 552 0.601
## 10 89 371 548 0.677
## # … with 24 more rows
summarise command above to calculate
these values.sum(democrat) above, we used
mean(democrat), what does this value return? Why does it
return this value?group_by with mutategroup_by with mutate output## # A tibble: 18,635 × 8
## # Groups: congress [34]
## congress chamber state party democrat num_democrat total prop_democrat
## <int> <chr> <chr> <chr> <dbl> <dbl> <int> <dbl>
## 1 80 house TX D 1 247 555 0.445
## 2 80 house NC D 1 247 555 0.445
## 3 80 house IL D 1 247 555 0.445
## 4 80 house NJ R 0 247 555 0.445
## 5 80 house KY R 0 247 555 0.445
## 6 80 house PA R 0 247 555 0.445
## 7 80 house CA R 0 247 555 0.445
## 8 80 house NY D 1 247 555 0.445
## 9 80 house WI R 0 247 555 0.445
## 10 80 house MA R 0 247 555 0.445
## # … with 18,625 more rows
|> is the answerpipe_congress <- congress_age |>
filter(congress >= 100) |>
mutate(democrat = ifelse(party == 'D', 1, 0)) |>
group_by(congress, chamber) |>
summarise(
num_democrat = sum(democrat),
total = n(),
prop_democrat = num_democrat / total
)
nested_congress <- summarise(
group_by(
mutate(
filter(
congress_age, congress >= 100
),
democrat = ifelse(party == 'D', 1, 0)
),
congress, chamber
),
num_democrat = sum(democrat),
total = n(),
prop_democrat = num_democrat / total
)
identical(pipe_congress, nested_congress)## [1] TRUE
summarise(
group_by(
mutate(
filter(
diamonds,
color %in% c('D', 'E', 'F') & cut %in% c('Fair', 'Good', 'Very Good')
),
f_color = ifelse(color == 'F', 1, 0),
vg_cut = ifelse(cut == 'Very Good', 1, 0)
),
clarity
),
avg = mean(carat),
sd = sd(carat),
avg_p = mean(price),
num = n(),
summary_f_color = mean(f_color),
summary_vg_cut = mean(vg_cut)
)rename_with() to rename with a function## # A tibble: 344 × 8
## species island BILL_LENGTH_MM BILL_DEPTH_MM FLIPPER_LENGTH_MM body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # … with 334 more rows, and 2 more variables: sex <fct>, year <int>
across()across() function can be helpful to work with
columns.across() with summarise()## # A tibble: 3 × 6
## species bill_length_mm bill_depth_mm flipper_length_mm body_mass_g year
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie 38.8 18.3 190. 3701. 2008.
## 2 Chinstrap 48.8 18.4 196. 3733. 2008.
## 3 Gentoo 47.5 15.0 217. 5076. 2008.
penguins %>%
group_by(species) %>%
summarise(across(where(is.numeric),
list(mean = ~mean(.x, na.rm = TRUE),
n_miss = ~ sum(is.na(.x)))))## # A tibble: 3 × 11
## species bill_length_mm_m… bill_length_mm_… bill_depth_mm_m… bill_depth_mm_n…
## <fct> <dbl> <int> <dbl> <int>
## 1 Adelie 38.8 1 18.3 1
## 2 Chinstrap 48.8 0 18.4 0
## 3 Gentoo 47.5 1 15.0 1
## # … with 6 more variables: flipper_length_mm_mean <dbl>,
## # flipper_length_mm_n_miss <int>, body_mass_g_mean <dbl>,
## # body_mass_g_n_miss <int>, year_mean <dbl>, year_n_miss <int>
penguins %>%
group_by(species) %>%
summarise(across(ends_with('mm'),
list(mean = ~mean(.x, na.rm = TRUE),
n_miss = ~ sum(is.na(.x)))))## # A tibble: 3 × 7
## species bill_length_mm_m… bill_length_mm_… bill_depth_mm_m… bill_depth_mm_n…
## <fct> <dbl> <int> <dbl> <int>
## 1 Adelie 38.8 1 18.3 1
## 2 Chinstrap 48.8 0 18.4 0
## 3 Gentoo 47.5 1 15.0 1
## # … with 2 more variables: flipper_length_mm_mean <dbl>,
## # flipper_length_mm_n_miss <int>
## Rows: 8031 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Date / Time, City, State, Shape, Duration, Summary, Posted
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 8,031 × 7
## `Date / Time` City State Shape Duration Summary Posted
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 12/12/14 17:30 North Wales PA Triang… 5 minut… "I hea… <NA>
## 2 12/12/14 12:40 Cartersville GA Unknown 3.6 min… "Looki… 12/12…
## 3 12/12/14 06:30 Isle of Man (UK/England) <NA> Light 2 secon… "Over … 12/12…
## 4 12/12/14 01:00 Miamisburg OH Changi… <NA> "Brigh… 12/12…
## 5 12/12/14 00:00 Spotsylvania VA Unknown 1 minute "White… 12/12…
## 6 12/11/14 23:25 Kenner LA Chevron ~1 minu… "Stran… 12/12…
## 7 12/11/14 23:15 Eugene OR Disk 2 minut… "Dual … 12/12…
## 8 12/11/14 20:04 Phoenix AZ Chevron 3 minut… "4 Ora… 12/12…
## 9 12/11/14 20:00 Franklin NC Disk 5 minut… "There… 12/12…
## 10 12/11/14 18:30 Longview WA Cylind… 10 seco… "Two c… 12/12…
## # … with 8,021 more rows
ufo_man <- read_csv("https://raw.githubusercontent.com/lebebr01/iowa_data_science/master/data/ufo.csv",
skip = 1,
col_names = c('Date/Time', 'City', 'State',
'Shape', 'Duration', 'Summary',
'Posted'))## Rows: 8031 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Date/Time, City, State, Shape, Duration, Summary, Posted
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 8,031 × 7
## `Date/Time` City State Shape Duration Summary Posted
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 12/12/14 17:30 North Wales PA Triang… 5 minut… "I hea… <NA>
## 2 12/12/14 12:40 Cartersville GA Unknown 3.6 min… "Looki… 12/12…
## 3 12/12/14 06:30 Isle of Man (UK/England) <NA> Light 2 secon… "Over … 12/12…
## 4 12/12/14 01:00 Miamisburg OH Changi… <NA> "Brigh… 12/12…
## 5 12/12/14 00:00 Spotsylvania VA Unknown 1 minute "White… 12/12…
## 6 12/11/14 23:25 Kenner LA Chevron ~1 minu… "Stran… 12/12…
## 7 12/11/14 23:15 Eugene OR Disk 2 minut… "Dual … 12/12…
## 8 12/11/14 20:04 Phoenix AZ Chevron 3 minut… "4 Ora… 12/12…
## 9 12/11/14 20:00 Franklin NC Disk 5 minut… "There… 12/12…
## 10 12/11/14 18:30 Longview WA Cylind… 10 seco… "Two c… 12/12…
## # … with 8,021 more rows
ufo_date <- read_csv("Data/ufo.csv",
col_types = list(
'Date / Time' = col_datetime(format = "%m/%d/%y %H:%M"),
City = col_character(),
State = col_character(),
Shape = col_character(),
Duration = col_character(),
Summary = col_character(),
Posted = col_character()
))## Warning: One or more parsing issues, see `problems()` for details
## # A tibble: 8,031 × 7
## `Date / Time` City State Shape Duration Summary Posted
## <dttm> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2014-12-12 17:30:00 North Wales PA Tria… 5 minut… "I hea… <NA>
## 2 2014-12-12 12:40:00 Cartersville GA Unkn… 3.6 min… "Looki… 12/12…
## 3 2014-12-12 06:30:00 Isle of Man (UK/Engl… <NA> Light 2 secon… "Over … 12/12…
## 4 2014-12-12 01:00:00 Miamisburg OH Chan… <NA> "Brigh… 12/12…
## 5 2014-12-12 00:00:00 Spotsylvania VA Unkn… 1 minute "White… 12/12…
## 6 2014-12-11 23:25:00 Kenner LA Chev… ~1 minu… "Stran… 12/12…
## 7 2014-12-11 23:15:00 Eugene OR Disk 2 minut… "Dual … 12/12…
## 8 2014-12-11 20:04:00 Phoenix AZ Chev… 3 minut… "4 Ora… 12/12…
## 9 2014-12-11 20:00:00 Franklin NC Disk 5 minut… "There… 12/12…
## 10 2014-12-11 18:30:00 Longview WA Cyli… 10 seco… "Two c… 12/12…
## # … with 8,021 more rows
## # A tibble: 56 × 5
## row col expected actual file
## <int> <int> <chr> <chr> <chr>
## 1 120 1 date like %m/%d/%y %H:%M 12/1/14 /Users/bleb/Library/CloudStora…
## 2 195 1 date like %m/%d/%y %H:%M 11/27/14 /Users/bleb/Library/CloudStora…
## 3 237 1 date like %m/%d/%y %H:%M 11/24/14 /Users/bleb/Library/CloudStora…
## 4 408 1 date like %m/%d/%y %H:%M 11/15/14 /Users/bleb/Library/CloudStora…
## 5 666 1 date like %m/%d/%y %H:%M 10/31/14 /Users/bleb/Library/CloudStora…
## 6 798 1 date like %m/%d/%y %H:%M 10/25/14 /Users/bleb/Library/CloudStora…
## 7 947 1 date like %m/%d/%y %H:%M 10/19/14 /Users/bleb/Library/CloudStora…
## 8 1082 1 date like %m/%d/%y %H:%M 10/14/14 /Users/bleb/Library/CloudStora…
## 9 1123 1 date like %m/%d/%y %H:%M 10/12/14 /Users/bleb/Library/CloudStora…
## 10 1124 1 date like %m/%d/%y %H:%M 10/12/14 /Users/bleb/Library/CloudStora…
## # … with 46 more rows
read_tsvread_fwfread_tableread_delimfile.choose(). For example,
read_tsv(file.choose()).
read_excel## # A tibble: 891 × 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 1 0 3 Braun… male 22 1 0 A/5 2… 7.25 <NA>
## 2 2 1 1 Cumin… fema… 38 1 0 PC 17… 71.3 C85
## 3 3 1 3 Heikk… fema… 26 0 0 STON/… 7.92 <NA>
## 4 4 1 1 Futre… fema… 35 1 0 113803 53.1 C123
## 5 5 0 3 Allen… male 35 0 0 373450 8.05 <NA>
## 6 6 0 3 Moran… male NA 0 0 330877 8.46 <NA>
## 7 7 0 1 McCar… male 54 0 0 17463 51.9 E46
## 8 8 0 3 Palss… male 2 3 1 349909 21.1 <NA>
## 9 9 1 3 Johns… fema… 27 0 2 347742 11.1 <NA>
## 10 10 1 2 Nasse… fema… 14 1 0 237736 30.1 <NA>
## # … with 881 more rows, and 1 more variable: Embarked <chr>